PageRank algorithm

Implement a scalable PageRank algorithm on the Google web graph dataset Google.txt. Use a random teleporting probability of 0.2. Each line in the file has two numbers separated by a space. This represents a directed edge in the graph. For example, "0 824020" is an edge from node 0 to node 824020. Calculate the PageRank value of node '99'.


In [1]:
from scipy.sparse import coo_matrix
import numpy as np

In [2]:
nodesID = {}
n = 0
line_count = 0

In [3]:
with open('Google.txt','r') as google:
    for line in google:
        line_count += 1
        if line.startswith('#') : continue
        tokens = line.strip().split('\t')
        #print tokens
        if tokens[0] not in nodesID: 
            nodesID[tokens[0]] = n
            n += 1
        
        if tokens[1] not in nodesID: 
            nodesID[tokens[1]] = n
            n += 1
            
print len(nodesID)
print n


875713
875713

In [4]:
col = []
row = []
value = []

In [5]:
line_count = 0
with open('Google.txt','r') as google:
    for line in google:
        line_count += 1
        if line.startswith('#') : continue
        tokens = line.strip().split('\t')
        url1 = nodesID[tokens[0]]
        url2 = nodesID[tokens[1]]

        col.append(url1)
        row.append(url2)
        value.append(1.0)

In [6]:
M = coo_matrix((value, (row,col)), shape=(n, n))
print M
print M.shape


  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (0, 1)	1.0
  (5, 1)	1.0
  (6, 1)	1.0
  (7, 1)	1.0
  (8, 1)	1.0
  (9, 1)	1.0
  (10, 1)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 1)	1.0
  (14, 1)	1.0
  (15, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (0, 2)	1.0
  (16, 2)	1.0
  (8, 2)	1.0
  (9, 2)	1.0
  (17, 2)	1.0
  (18, 2)	1.0
  (19, 2)	1.0
  :	:
  (233421, 875711)	1.0
  (233422, 875711)	1.0
  (233423, 875711)	1.0
  (233424, 875711)	1.0
  (233425, 875711)	1.0
  (233426, 875711)	1.0
  (233427, 875711)	1.0
  (233428, 875711)	1.0
  (233429, 875711)	1.0
  (233430, 875711)	1.0
  (233431, 875711)	1.0
  (233432, 875711)	1.0
  (233433, 875711)	1.0
  (542444, 875712)	1.0
  (13651, 875712)	1.0
  (542445, 875712)	1.0
  (13652, 875712)	1.0
  (542446, 875712)	1.0
  (542447, 875712)	1.0
  (542448, 875712)	1.0
  (542449, 875712)	1.0
  (542450, 875712)	1.0
  (13656, 875712)	1.0
  (9430, 875712)	1.0
  (542451, 875712)	1.0
(875713, 875713)

In [7]:
inLink = M.sum(1)
print inLink


[[ 212.]
 [ 200.]
 [   4.]
 ..., 
 [   0.]
 [   0.]
 [   0.]]

In [8]:
outLink = M.sum(0).T
print outLink
print outLink.shape


[[  4.]
 [ 14.]
 [ 11.]
 ..., 
 [  2.]
 [ 15.]
 [ 12.]]
(875713, 1)

In [9]:
value = [1.0 / outLink[col[i], 0] for i, v in enumerate(value)]

In [10]:
M = coo_matrix((value, (row,col)), shape=(n, n))
print M
print M.shape


  (1, 0)	0.25
  (2, 0)	0.25
  (3, 0)	0.25
  (4, 0)	0.25
  (0, 1)	0.0714285714286
  (5, 1)	0.0714285714286
  (6, 1)	0.0714285714286
  (7, 1)	0.0714285714286
  (8, 1)	0.0714285714286
  (9, 1)	0.0714285714286
  (10, 1)	0.0714285714286
  (11, 1)	0.0714285714286
  (12, 1)	0.0714285714286
  (13, 1)	0.0714285714286
  (14, 1)	0.0714285714286
  (15, 1)	0.0714285714286
  (3, 1)	0.0714285714286
  (4, 1)	0.0714285714286
  (0, 2)	0.0909090909091
  (16, 2)	0.0909090909091
  (8, 2)	0.0909090909091
  (9, 2)	0.0909090909091
  (17, 2)	0.0909090909091
  (18, 2)	0.0909090909091
  (19, 2)	0.0909090909091
  :	:
  (233421, 875711)	0.0666666666667
  (233422, 875711)	0.0666666666667
  (233423, 875711)	0.0666666666667
  (233424, 875711)	0.0666666666667
  (233425, 875711)	0.0666666666667
  (233426, 875711)	0.0666666666667
  (233427, 875711)	0.0666666666667
  (233428, 875711)	0.0666666666667
  (233429, 875711)	0.0666666666667
  (233430, 875711)	0.0666666666667
  (233431, 875711)	0.0666666666667
  (233432, 875711)	0.0666666666667
  (233433, 875711)	0.0666666666667
  (542444, 875712)	0.0833333333333
  (13651, 875712)	0.0833333333333
  (542445, 875712)	0.0833333333333
  (13652, 875712)	0.0833333333333
  (542446, 875712)	0.0833333333333
  (542447, 875712)	0.0833333333333
  (542448, 875712)	0.0833333333333
  (542449, 875712)	0.0833333333333
  (542450, 875712)	0.0833333333333
  (13656, 875712)	0.0833333333333
  (9430, 875712)	0.0833333333333
  (542451, 875712)	0.0833333333333
(875713, 875713)

In [11]:
beta= 0.8
epsilon = 1./(10**11)
r = np.ones([n,1])
r = r/n
print np.sum(r)


1.0

In [12]:
for _ in xrange(250):
    old_r = r
    r  = beta * M * r
    for j in xrange(n):
        if inLink[j,0] == 0 :
            r[j] = 0
    
    S = r.sum()
    r = r +  (1 - S)/n
    
    if np.sum(np.abs(old_r - r)) < epsilon:
        print "{} iterations".format(_)
        old_r = r
        break
    else:
        old_r = r

print 'Pagerank("99"):', r[nodesID['99']]


95 iterations
Pagerank("99"): [  3.72200116e-07]

In [ ]: